import os
import pandas as pd
import csv
import numpy as np
import math
from matplotlib.lines import Line2D
import matplotlib as mpl
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
import statsmodels.stats.multitest as sm
This file includes the results of a microbiome analysis performed on samples taken from four individuals that were originally used to determine the “Impact of DNA source on genetic variant detection from human whole-genome sequencing data”.
This included blood, saliva and buccal samples taken from four individuals (blood samples were taken at a different time than saliva and buccal samples). Additionally, a methylation-based enrichment for eukaryotic DNA was performed on the saliva and buccal samples.
Sections not completed:
- Absolute number of reads between different reference databases
- The intersection between different reference databases
- Functional profiling using HUMAnN2 (if I want to look more into the families that are differentially abundant)
- HUMAnN3 taxonomy
- Functional profiling using HUMAnN3
- Functional profiling using MEGAHIT and anvi’o
- Some kind of functional richness?
Fastq.gz files were downloaded from the ENA database, project accession number PRJNA523344
Kneaddata was used for quality control and removal of human sequences. This included:
- Trimmomatic 0.39: “SLIDINGWINDOW:4:20 MINLEN:50”
- Bowtie2 with the GRCh38_PhiX database (to remove human and PhiX reads): “–fast –dovetail”
#3
#set up colors function (to get up to 120 colors, but with up to 40 unique colors)
def get_cols(num):
colormap_20, colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20', 256), mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
m1, m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_20), mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
colors_20 = [m1.to_rgba(a) for a in range(20)]
colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
if num < 21: return colors_20
elif num < 41: return colors_40
else: return colors_40+colors_40+colors_40
#and colors and shapes for different participants and body sites
colors_dict, shapes_dict = {'Blood':'#900C3F', 'Saliva':'#016F85', 'Buccal':'#ff8300', 'Saliva_euk':'#02aed1', 'Buccal_euk':'#FFC300'}, {'Huref':'o', 'PGPC-0002':'^', 'PGPC-0005':'*', 'PGPC-0006':'s', 'PGPC-0050':'p'}
#4
#get numbers of reads for different steps
reads = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/read_counts.txt', sep='\t', index_col=3, header=0)
participant_dict, site_dict, full_name_dict = {}, {}, {}
samples = list(reads.index.values)
for s in samples:
participant_dict[s] = reads.loc[s, 'Participant']
site_dict[s] = reads.loc[s, 'Body site']
full_name_dict[s] = reads.loc[s, 'Participant']+' '+reads.loc[s, 'Body site']
total_reads = pd.DataFrame(reads.loc[:, 'cat_reads'])
sample_names = [participant_dict[name]+' '+site_dict[name] for name in samples]
colors = [colors_dict[s] for s in list(reads.loc[:, 'Body site'].values)]
shapes = [shapes_dict[s] for s in list(reads.loc[:, 'Participant'].values)]
#5
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Reads kept (%)')
plt.xlim([-0.5,20.5])
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads kept (%)')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()
#6
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.xlim([-0.5,20.5])
plt.ylabel('Reads remaining')
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads remaining')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()
#7
reads_remain = reads.loc[:, ['Percentage', 'cat_reads']].rename(index=full_name_dict, columns = {'cat_reads':'Number'})
#8
py$reads_remain %>%
kable() %>%
kable_styling()
| Percentage | Number | |
|---|---|---|
| Huref Blood | 0.2319067 | 1087311 |
| PGPC-0002 Blood | 0.5748318 | 3321664 |
| PGPC-0005 Blood | 0.5149286 | 1903482 |
| PGPC-0006 Blood | 0.7146067 | 3593441 |
| PGPC-0050 Blood | 0.6870164 | 2940390 |
| PGPC-0002 Saliva | 3.9289510 | 16561343 |
| PGPC-0005 Saliva | 55.6458826 | 244058333 |
| PGPC-0006 Saliva | 13.1608121 | 60497393 |
| PGPC-0050 Saliva | 56.3700720 | 257049004 |
| PGPC-0002 Buccal | 2.8112226 | 11462781 |
| PGPC-0005 Buccal | 2.8202504 | 12751614 |
| PGPC-0006 Buccal | 5.1521560 | 22434503 |
| PGPC-0050 Buccal | 2.4297107 | 10667011 |
| PGPC-0002 Saliva_euk | 1.2333408 | 5345888 |
| PGPC-0005 Saliva_euk | 8.3104464 | 38138382 |
| PGPC-0006 Saliva_euk | 1.9067564 | 8786635 |
| PGPC-0050 Saliva_euk | 5.6187272 | 25037552 |
| PGPC-0002 Buccal_euk | 0.9385856 | 4380036 |
| PGPC-0005 Buccal_euk | 1.2119820 | 5584415 |
| PGPC-0006 Buccal_euk | 1.5567291 | 7232280 |
| PGPC-0050 Buccal_euk | 1.3836141 | 6382288 |
The taxonomy has been profiled using:
1. HUMAnN
- HUMAnN2 and MetaPhlAn2
- HUMAnN3 and MetaPhlAn3
2. Kraken2 with Bracken
- GTDB (no confidence parameter set) -
using the database constructed using Struo, release 89 - GTDB (confidence = 0.1)
- Minikraken v1 (no human genome, no confidence parameter set)
- Minikraken v1 (no human genome, confidence = 0.1)
- Minikraken v2 (with human genome, no confidence parameter set)
- Minikraken v2 (with human genome, confidence = 0.1)
- RefSeq Complete v93 (no confidence parameter set)
- RefSeq Complete v93 (confidence = 0.1)
#9
#get the taxonomy file and sort it to strain and genus level
taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/metaphlan_merged.tsv', sep='\t', header=0, index_col=0)
tax_names = list(taxa.index.values)
keeping = []
for a in range(len(tax_names)):
if 't__' in tax_names[a]:
keeping.append(True)
elif 'unclassified' in tax_names[a]:
keeping.append(True)
else:
keeping.append(False)
strain = taxa.loc[keeping, :]
strain_names = list(strain.index.values)
strain_dict = {}
for i in range(len(strain_names)):
strain_dict[strain_names[i]] = strain_names[i].split('|s__')[0].split('|g__')[1]
genus = strain.rename(index=strain_dict)
genus = genus.groupby(by=genus.index, axis=0).sum()
#10
#define the function that calculates the nmds plots
def transform_for_NMDS(df, dist_met='braycurtis'):
X = df.iloc[0:].values
y = df.iloc[:,0].values
seed = np.random.RandomState(seed=3)
X_true = X
similarities = distance.cdist(X_true, X_true, dist_met)
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
dissimilarity="precomputed", n_jobs=1)
#print(similarities)
pos = mds.fit(similarities).embedding_
nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
dissimilarity="precomputed", random_state=seed, n_jobs=1,
n_init=1)
npos = nmds.fit_transform(similarities, init=pos)
# Rescale the data
pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
# Rotate the data
clf = PCA()
X_true = clf.fit_transform(X_true)
pos = clf.fit_transform(pos)
npos = clf.fit_transform(npos)
return pos, npos, nmds.stress_
strain_t = strain.transpose()
genus_t = genus.transpose()
handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]
#11
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'braycurtis')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'braycurtis')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
#12
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'euclidean')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'euclidean')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()
#13
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'jaccard')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'jaccard')
plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a], color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()
Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Kingdom level for each sample.
#14
plt.figure(figsize=(7,5))
ax1 = plt.subplot(111)
plt.bar(list(taxa.columns.values), taxa.loc['k__Viruses', :].values, color='#C70039', edgecolor='k')
plt.bar(list(taxa.columns.values), taxa.loc['k__Bacteria', :].values, bottom=taxa.loc['k__Viruses', :].values, color='#026B81', edgecolor='k')
#plt.xticks(list(taxa.columns.values), sample_names, rotation=90)
empty = []
for x in range(0,21):
empty.append('')
ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
handles = [Patch(facecolor='#C70039', edgecolor='k', label='Viruses'), Patch(facecolor='#026B81', edgecolor='k', label='Bacteria')]
plt.legend(handles=handles, bbox_to_anchor=(1,1.05))
plt.tight_layout()
plt.show()
Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Genus level for each sample. Genera with below 1% maximum relative abundance have been removed.
#15
genus = genus[genus.max(axis=1) > 1]
genera = list(genus.index.values)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
gen_colors = get_cols(len(genus.index.values))
handles = []
for g in range(len(genera)):
this_gen = genus.loc[genera[g], :].values
if g == 0:
ax1.bar(list(genus.columns.values), this_gen, color=gen_colors[g], edgecolor='k')
total = this_gen
else:
ax1.bar(list(genus.columns.values), this_gen, bottom=total, color=gen_colors[g], edgecolor='k')
total = this_gen+total
handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=genera[g]))
empty = []
for x in range(0,21):
empty.append('')
ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
plt.legend(handles=handles, bbox_to_anchor=(1,1.05), ncol=2)
plt.tight_layout()
plt.show()
#16
#get all samples into dataframes based on the database that they use
folders = sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2'))
del folders[0]
kraken_columns = {0:'Percent fragments clade', 1:'Number fragments clade', 2:'Number fragments taxon', 3:'Level', 4:'NCBI ID', 5:'Taxon name'}
kraken_all_db, bracken_all_db, all_domains = [], [], {}
for fol in folders:
if fol == 'db_genera' or fol == 'db_genera_in_common':
continue
if not os.path.isdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol):
continue
bracken, kraken, bracken_kreport = [], [], []
bracken_pd, kraken_pd = [], []
for fi in sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol)):
if fi[-7:] == 'bracken':
bracken.append(fi)
elif fi[-7:] == 'kreport' and 'bracken' not in fi:
kraken.append(fi)
elif fi[-7:] == 'kreport':
bracken_kreport.append(fi)
for bk in bracken_kreport:
with open('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+bk, 'rU') as f:
bk = []
domains = {}
this_domain, domain_name = [], ''
for row in csv.reader(f, delimiter='\t'):
bk.append(row)
row[5] = row[5].lstrip()
if row[3] == 'D':
if domain_name != '':
domains[domain_name] = this_domain
this_domain, domain_name = [], row[5]
else:
if row[3] != 'R' and row[3] != 'U' and 'D' not in row[3]:
this_domain.append(row[5])
domains[domain_name] = this_domain
for domain in domains:
if domain in all_domains:
all_domains[domain] = list(set(all_domains[domain]+domains[domain]))
else:
all_domains[domain] = list(set(domains[domain]))
for b in bracken:
if len(b) > 22:
continue
sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+b, sep='\t', header=0, index_col=0)
b = b.replace('_150', '')
sample.drop(['taxonomy_id', 'taxonomy_lvl', 'kraken_assigned_reads', 'added_reads', 'fraction_total_reads'], axis=1, inplace=True)
sample.rename(columns={'new_est_reads':b[:-8]}, inplace=True)
bracken_pd.append(sample)
for k in kraken:
sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+k, sep='\t', header=None, index_col=3)
sample = sample.loc[['U', 'D'], :]
sample = sample.rename(columns=kraken_columns).drop(['Number fragments taxon', 'NCBI ID'], axis=1).rename(columns={'Percent fragments clade':k[:-8]+'_percent', 'Number fragments clade':k[:-8]+'_reads'}).set_index('Taxon name')
taxa = list(sample.index.values)
taxa_dict = {}
for t in taxa:
taxa_dict[t] = t.replace(' ', '')
sample = sample.rename(index=taxa_dict)
kraken_pd.append(sample)
bracken = pd.concat(bracken_pd, join='outer')
kraken = pd.concat(kraken_pd, join='outer')
kraken = kraken.rename(index={'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea'})
kraken = kraken.groupby(by=kraken.index, axis=0).sum()
bracken = bracken.groupby(by=bracken.index, axis=0).sum().fillna(value=0)
kraken_all_db.append(kraken), bracken_all_db.append(bracken)
#17
x1 = [x for x in range(21)]
x2 = [x+0.3 for x in range(21)]
tax_plotting = ['Archaea', 'Bacteria', 'Eukaryota', 'Viruses', 'unclassified']
color_plotting = ['#EDBB99', '#5499C7', '#7DCEA0', '#F7DC6F', '#CCD1D1']
tax_paper = ['Bacteria', 'Eukaryota', 'Other', 'Unclassified']
color_paper = ['#5499C7', '#7DCEA0', '#CD6155', '#CCD1D1']
from_paper = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/from_paper.csv', header=0, index_col=0)
def get_summary_reads(kraken_db):
fig = plt.figure(figsize=(15,15))
ax1, ax2, ax3, ax4, ax5 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325)
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
ax5.set_title('From paper')
ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
axs = [ax1, ax2, ax3, ax4, ax5]
for s in range(len(samples)):
if s == 0:
continue
bottom = 0
for t in range(len(tax_paper)):
ax5.bar(x1[s], from_paper.loc[tax_paper[t], samples[s]], bottom=bottom, color=color_paper[t], edgecolor='k', width=0.6)
bottom += from_paper.loc[tax_paper[t], samples[s]]
for db in range(len(kraken_db)):
ax_using = ax_plot[db]
x = x_plot[db]
db = kraken_db[db]
handles = []
for tax in range(len(tax_plotting)):
handles.append(Patch(facecolor=color_plotting[tax], edgecolor='k', label=tax_plotting[tax]))
tax = tax_plotting[tax]
if tax not in list(db.index.values):
db.loc[tax] = [0 for i in range(db.shape[1])]
handles.append(Patch(facecolor=color_paper[2], edgecolor='k', label='Other'))
db = db.fillna(value=0)
for s in range(len(samples)):
bottom = 0
for t in range(len(tax_plotting)):
prop = db.loc[tax_plotting[t], samples[s]+'_reads']
cat = total_reads.loc[samples[s], 'cat_reads']
prop = (prop/cat)*100
ax_using.bar(x[s], prop, bottom=bottom, color=color_plotting[t], edgecolor='k', width=0.3)
bottom += prop
ax2.legend(handles=handles, bbox_to_anchor=(1,1.05))
for ax in axs:
plt.sca(ax)
plt.xticks(x1, ['' for x in x1])
plt.ylim([0, 100])
plt.xlim([-0.5, 20.5])
for x in x1:
ax5.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
ax4.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
ax1.set_ylabel('Classified (%)'), ax3.set_ylabel('Classified (%)'), ax5.set_ylabel('Classified(%)')
#plt.tight_layout()
return
def get_summary_bacteria(kraken_db):
tax_plotting = ['Bacteria']
alpha = ['#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F']
fig = plt.figure(figsize=(10,8))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
axs = [ax1, ax2, ax3, ax4]
for db in range(len(kraken_db)):
ax_using = ax_plot[db]
x = x_plot[db]
alp = alpha[db]
db = kraken_db[db]
for tax in range(len(tax_plotting)):
tax = tax_plotting[tax]
if tax not in list(db.index.values):
db.loc[tax] = [0 for i in range(db.shape[1])]
db = db.fillna(value=0)
for s in range(len(samples)):
bottom = 0
for t in range(len(tax_plotting)):
prop = db.loc[tax_plotting[t], samples[s]+'_reads']
cat = total_reads.loc[samples[s], 'cat_reads']
ax_using.bar(x[s], prop, bottom=bottom, color=alp, edgecolor='k', width=0.3)
bottom += prop
handles = []
handles.append(Patch(facecolor=alpha[0], edgecolor='k', label='No confidence value'))
handles.append(Patch(facecolor=alpha[1], edgecolor='k', label='Confidence=0.1'))
ax2.legend(handles=handles, bbox_to_anchor=(1.6,1.03))
for ax in axs:
plt.sca(ax)
plt.xticks(x1, ['' for x in x1])
plt.semilogy()
plt.xlim([-0.5, 20.5])
#plt.ylim([0, 100])
plt.xlim([-0.5, 20.5])
for x in x1:
pl = ((1/21)*(x+1))-0.02
ax3.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax3.transAxes)
ax4.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax4.transAxes)
ax1.set_ylabel('Number of reads'), ax3.set_ylabel('Number of reads')
#plt.tight_layout()
return
A summary of the percentage of reads classified as different domains with different databases. Note that the ‘From paper’ plot uses the classifications given in the original paper, where 10,000 unmapped reads were classified using BLAST searches of the NCBI database.
#18
get_summary_reads(kraken_all_db)
plt.show()
Summary of the number of reads that are classified as bacteria by each database.
#19
get_summary_bacteria(kraken_all_db)
plt.tight_layout()
plt.show()
These first plots are all separately with the confidence parameter set. See the last tab for those without the confidence parameter set.
#20
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
for db in range(len(bracken_all_db)):
d = int(db)
db = bracken_all_db[db]
species = list(db.index.values)
keeping = []
species_dict = {}
for sp in species:
if sp in bacteria:
keeping.append(True)
new_sp = sp.split('__')
if len(new_sp) > 1:
new_sp = new_sp[1]
else:
new_sp = new_sp[0]
species_dict[sp] = new_sp.split(' ')[0].replace("'", '')
else:
keeping.append(False)
in_len = db.shape[0]
db = db.loc[keeping, :]
strain.append(db)
db = db.rename(index=species_dict)
db = db.groupby(by=db.index, axis=0).sum()
sums = db.sum(axis=0)
gen_sums.append(sums)
db = db.divide(sums, axis=1).multiply(100)
genera.append(db)
db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/'+db_names[d]+'.csv')
gen_names = gen_names+list(db.index.values)
db = db[db.max(axis=1) > 1]
genera_1.append(db)
gen_names_1 = gen_names_1+list(db.index.values)
gen_names = list(set(gen_names))
gen_names_1 = list(set(gen_names_1))
#21
def plot_four_nmds(dbs, metric, name):
fig = plt.figure(figsize=(15,10))
#fig.suptitle(name+metric+'\n\n\n')
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axs = [ax3, ax1, ax2, ax4]
ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
for db in range(len(dbs)):
n = db
db = dbs[db].transpose()
pos, npos, stress = transform_for_NMDS(db, metric)
for a in range(len(npos)):
axs[n].scatter(npos[a,0], npos[a,1], marker=shapes[a], color=colors[a], s=100, edgecolor='k')
axs[n].set_xlabel('nMDS1')
axs[n].set_ylabel('nMDS2')
handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]
ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
return
#22
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'braycurtis', 'NMDS confidence=0.1 strain ')
plt.show()
#23
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'braycurtis', 'NMDS confidence=0.1 genera ')
plt.show()
#24
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'euclidean', 'NMDS confidence=0.1 strain ')
plt.show()
#25
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'euclidean', 'NMDS confidence=0.1 genera ')
plt.show()
#26
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'jaccard', 'NMDS confidence=0.1 strain ')
plt.show()
#27
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'jaccard', 'NMDS confidence=0.1 genera ')
plt.show()
Bray-curtis distance at strain level
#28
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'braycurtis', 'NMDS no confidence strain ')
plt.show()
Bray-curtis distance at genus level
#29
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'braycurtis', 'NMDS no confidence genera ')
plt.show()
Euclidean distance at strain level
#30
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'euclidean', 'NMDS no confidence strain ')
plt.show()
Euclidean distance at genus level
#31
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'euclidean', 'NMDS no confidence genera ')
plt.show()
Jaccard distance at strain level
#32
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'jaccard', 'NMDS no confidence strain ')
plt.show()
Jaccard distance at genus level
#33
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'jaccard', 'NMDS no confidence genera ')
plt.show()
These plots are now only calculated for the classifications that used confidence = 0.1. Genera with below 1% maximum relative abundance are removed and the numbers in brackets are the number of reads that were classified as bacteria.
#34
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
#bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
#genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
gen_names_1 = sorted(gen_names_1)
def plot_genera(db, sums, tax_cols, gen_names_1, dname):
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
bottom = [0 for x in range(len(db.columns))]
handles = []
for g in range(len(gen_names_1)):
if gen_names_1[g] in db.index.values:
this_row = db.loc[gen_names_1[g], :].values
ax1.bar(x1, this_row, bottom=bottom, color=tax_cols[g], edgecolor='k')
handles.append(Patch(facecolor=tax_cols[g], edgecolor='k', label=gen_names_1[g]))
for b in range(len(bottom)):
bottom[b] += this_row[b]
ax1.legend(handles=handles, bbox_to_anchor=(1, 1.03), ncol=3)
plt.xticks(x1, ['' for x in x1])
plt.ylabel('Relative abundance(%)')
plt.xlim([-0.5, 20.5])
plt.ylim([0, 100])
for x in x1:
n = str(int(sums[samples[x]]))
ax1.text(x, -2, sample_names[x]+' ('+n+')', color=colors[x], rotation=90, va='top', ha='center')
plt.tight_layout()
return
gen_plot = [genera_1[1], genera_1[3], genera_1[5], genera_1[7]]
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
all_sums = [gen_sums[1], gen_sums[3], gen_sums[5], gen_sums[7]]
tax_cols = get_cols(len(gen_names_1))
#35
plot_genera(gen_plot[1], all_sums[1], tax_cols, gen_names_1, db_name[1])
plt.show()
#36
plot_genera(gen_plot[2], all_sums[2], tax_cols, gen_names_1, db_name[2])
plt.show()
#37
plot_genera(gen_plot[0], all_sums[0], tax_cols, gen_names_1, db_name[0])
plt.show()
#38
plot_genera(gen_plot[3], all_sums[3], tax_cols, gen_names_1, db_name[3])
plt.show()
Here I have carried out ANCOM2 tests for differential abundance of genera between body sites. All genera are then plotted on a heatmap with mean values for each body site and stars to denote significant differences as determined by ANCOM.
While many of these results vary between databases, it is worth noting that some differences are consistent, e.g.:
1. Prevotella are always more abundant in saliva samples
2. Streptococcus are always more abundant in buccal samples
3. Klebsiella (where present) are always more abundant in blood samples
#39
def get_differences(new_genus, db_name):
new_genus = new_genus.drop(['SRR8595488'], axis=1)
new_genus = new_genus[new_genus.max(axis=1) > 0.1]
samples = list(new_genus.columns)
list_comparisons = [['Blood', 'Saliva'], ['Blood', 'Buccal'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk'], ['Saliva_euk', 'Buccal_euk']]
comparisons, metadata, comp_len = [], [], []
for a in range(len(list_comparisons)):
keeping = []
this_md = []
for b in range(len(samples)):
if site_dict[samples[b]] in list_comparisons[a]:
keeping.append(True)
this_md.append([samples[b], site_dict[samples[b]]])
else:
keeping.append(False)
this_comp = new_genus.loc[:, keeping]
this_comp = this_comp[this_comp.max(axis=1) > 0.1]
comparisons.append(this_comp)
this_md = pd.DataFrame(this_md, columns=['Samples', 'Groups'])
#this_md.set_index('Samples', inplace=True)
metadata.append(this_md)
comp_len.append(this_comp.shape[0])
new_genus = new_genus.rename(columns=site_dict, inplace=False)
return comparisons, metadata, new_genus, comp_len
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
comparison_names = [r'Blood vs saliva', r'Blood vs buccal', r'Saliva vs buccal', r'Saliva vs saliva_euk', r'Buccal vs buccal_euk', r'Blood vs saliva_euk', r'Blood vs buccal_euk', r'Saliva_euk vs buccal_euk']
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")
get_ancom <- function(ft, md) {
all_ancom = list()
for (a in 1:8){
feature_table = ft[a]
meta_data = md[a]
process = feature_table_pre_process(feature_table, meta_data, 'Samples', 'Groups', lib_cut=10, neg_lb=TRUE)
ancom_out = ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var='Groups')
all_ancom[[a]] <- ancom_out$out
}
return(all_ancom)
}
def sort_ancom_results(r_ancom):
ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06 = [], [], [], []
for a in range(len(r_ancom)):
this_sig_09, this_sig_08, this_sig_07, this_sig_06 = [], [], [], []
r_ancom[a].set_index('taxa_id', inplace=True)
all_sp = list(r_ancom[a].index.values)
for b in range(len(all_sp)):
if r_ancom[a].loc[all_sp[b], 'detected_0.9'] == True:
this_sig_09.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.8'] == True:
this_sig_08.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.7'] == True:
this_sig_07.append(all_sp[b])
if r_ancom[a].loc[all_sp[b], 'detected_0.6'] == True:
this_sig_06.append(all_sp[b])
ancom_lists_09.append(this_sig_09), ancom_lists_08.append(this_sig_08), ancom_lists_07.append(this_sig_07), ancom_lists_06.append(this_sig_06)
return [ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06]
def plot_heatmap(new_genus, ANCOM):
print(ANCOM)
names = ['Blood', 'Saliva', 'Buccal', 'Saliva_euk', 'Buccal_euk']
other_names = ['Abundance', r'Blood $vs$ saliva', r'Blood $vs$ buccal', r'Saliva $vs$ buccal', r'Saliva $vs$ saliva_euk', r'Buccal $vs$ buccal_euk', r'Blood $vs$ saliva_euk', r'Blood $vs$ buccal_euk', r'Saliva_euk $vs$ buccal_euk']
colormap, norm = mpl.cm.get_cmap('plasma', 256), mpl.colors.Normalize(vmin=0, vmax=1)
m = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
if list(new_genus.index.values)[0][0] == 'A':
new_genus = new_genus.iloc[::-1]
figure = plt.figure(figsize=(5,new_genus.shape[0]*0.2))
ax1 = plt.subplot(111)
genus_names = list(new_genus.index.values)
y = []
for g in range(len(genus_names)):
this_row = new_genus.loc[genus_names[g], :]
values = [(np.mean(this_row[name].values)) for name in names]
bottom, top, x = [g for a in range(5)], [1 for a in range(5)], [a for a in range(5)]
y.append(g+0.5)
ma = max(values)
values = [v/ma for v in values]
values = [m.to_rgba(v) for v in values]
ax1.bar(x, top, bottom=bottom, color=values, edgecolor='k', width=1)
x.append(5)
ax1.scatter(x[-1], bottom[-1]+0.5, color='k', s=ma*5)
sig, x_plt = [], x[-1]
for a in range(len(ANCOM)):
x_plt += 0.75
if genus_names[g] in ANCOM[a]: ax1.scatter(x_plt, bottom[-1]+0.5, marker='*', color='k')
x.append(x_plt)
plt.ylim([0, len(genus_names)]), plt.xlim([-0.5, x[-1]+0.5])
plt.yticks(y, genus_names)
plt.xticks(x, names+other_names, rotation=90)
ax1.xaxis.set_ticks_position('top')
plt.tight_layout()
plt.show()
return
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 41 | 41 | 37 | 26 | 8 |
| Blood vs buccal | 37 | 27 | 21 | 15 | 3 |
| Saliva vs buccal | 45 | 3 | 3 | 2 | 1 |
| Saliva vs saliva_euk | 41 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 39 | 2 | 0 | 0 | 0 |
| Blood vs saliva_euk | 38 | 32 | 24 | 15 | 3 |
| Blood vs buccal_euk | 34 | 11 | 5 | 2 | 2 |
| Saliva_euk vs buccal_euk | 43 | 5 | 3 | 2 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Burkholderia', 'Enterobacter', 'Klebsiella', 'Paracoccus', 'Pasteurella', 'Prevotella', 'Rothia', 'Staphylococcus'], ['Burkholderia', 'Rothia', 'Streptococcus'], ['Prevotella'], [], [], ['Burkholderia', 'Prevotella', 'Rothia'], ['Rothia', 'Streptococcus'], ['Prevotella']]
##
## <string>:9: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_human_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 49 | 48 | 47 | 38 | 9 |
| Blood vs buccal | 48 | 35 | 16 | 14 | 4 |
| Saliva vs buccal | 39 | 5 | 4 | 1 | 0 |
| Saliva vs saliva_euk | 30 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 37 | 0 | 0 | 0 | 0 |
| Blood vs saliva_euk | 49 | 47 | 33 | 22 | 5 |
| Blood vs buccal_euk | 50 | 19 | 14 | 12 | 4 |
| Saliva_euk vs buccal_euk | 42 | 4 | 3 | 3 | 2 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Capnocytophaga', 'Clostridium', 'Klebsiella', 'Mogibacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptomyces', 'Veillonella'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], [], [], [], ['Campylobacter', 'Klebsiella', 'Prevotella', 'Pseudomonas', 'Streptomyces'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], ['Prevotella', 'Streptococcus']]
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/gtdb_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 98 | 88 | 82 | 32 | 6 |
| Blood vs buccal | 91 | 21 | 14 | 8 | 5 |
| Saliva vs buccal | 96 | 18 | 10 | 2 | 1 |
| Saliva vs saliva_euk | 93 | 3 | 2 | 0 | 0 |
| Buccal vs buccal_euk | 95 | 8 | 5 | 1 | 0 |
| Blood vs saliva_euk | 92 | 27 | 17 | 11 | 4 |
| Blood vs buccal_euk | 75 | 7 | 7 | 3 | 2 |
| Saliva_euk vs buccal_euk | 97 | 19 | 13 | 7 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Mycobacterium', 'Pauljensenia', 'Prevotella', 'Psychrobacter', 'Streptococcus', 'Veillonella'], ['Actinomyces', 'Prevotella', 'Rothia', 'Streptococcus', 'Veillonella'], ['Prevotella'], [], [], ['Acinetobacter', 'Prevotella', 'Streptococcus', 'Veillonella'], ['Acinetobacter', 'Poseidonibacter'], ['Prevotella']]
These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.
this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/refseq_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
kable() %>%
kable_styling()
| Comparison | Shared genera | ANCOM 0.6 | ANCOM 0.7 | ANCOM 0.8 | ANCOM 0.9 |
|---|---|---|---|---|---|
| Blood vs saliva | 85 | 81 | 77 | 35 | 11 |
| Blood vs buccal | 82 | 65 | 36 | 21 | 7 |
| Saliva vs buccal | 69 | 9 | 6 | 3 | 0 |
| Saliva vs saliva_euk | 64 | 0 | 0 | 0 | 0 |
| Buccal vs buccal_euk | 76 | 4 | 4 | 1 | 0 |
| Blood vs saliva_euk | 86 | 72 | 37 | 20 | 4 |
| Blood vs buccal_euk | 79 | 18 | 15 | 8 | 2 |
| Saliva_euk vs buccal_euk | 84 | 12 | 10 | 5 | 1 |
Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).
plot_heatmap(new_genus, ancom[0])
## [['Bordetella', 'Enterococcus', 'Escherichia', 'Klebsiella', 'Mesorhizobium', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Salmonella', 'Staphylococcus', 'Veillonella'], ['Actinomyces', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptococcus', 'Veillonella'], [], [], [], ['Mycobacterium', 'Prevotella', 'Pseudomonas', 'Veillonella'], ['Actinomyces', 'Veillonella'], ['Prevotella']]
Now we are looking at the taxa (grouped to genus level, to hopefully allow for differences between databases) that are in common between the different databases that have been used with Kraken2. We are also using only the versions that have confidence=0.1.
The plots below here show the number of reads in each sample before and after removal of the genera that aren’t present in all databases.
#print(bracken_all_db)
def intersection(lst1, lst2):
lst3 = [value for value in lst1 if value in lst2]
return lst3
conf_bracken_genus = []
genus_in_each = []
for a in range(len(bracken_all_db)):
if a not in [0, 2, 4, 6]:
continue
this_db = pd.DataFrame(bracken_all_db[a])
taxa = list(this_db.index.values)
tax_dict = {}
for b in range(len(taxa)):
orig_tax = taxa[b]
if 's__' in taxa[b]:
taxa[b] = taxa[b].replace('s__', '')
taxa[b] = taxa[b].split(' ')[0]
taxa[b] = taxa[b].replace("'", "")
tax_dict[orig_tax] = taxa[b]
this_db.rename(index=tax_dict, inplace=True)
genus = this_db.groupby(by=this_db.index, axis=0).sum()
conf_bracken_genus.append(genus)
genus_in_each.append(list(genus.index.values))
[print(len(gen)) for gen in genus_in_each]
## 7452
## 1717
## 1304
## 4736
## [None, None, None, None]
overall_genus = intersection(genus_in_each[0], genus_in_each[1])
overall_genus = intersection(overall_genus, genus_in_each[2])
overall_genus = intersection(overall_genus, genus_in_each[3])
print(len(overall_genus))
## 1054
fig = plt.figure(figsize=(10,6))
ax = [plt.subplot(223), plt.subplot(221), plt.subplot(222), plt.subplot(224)]
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x3 = [x+0.2 for x in range(21)]
conf_bracken_overall = []
names = ['gtdb', 'minikrakenv1', 'minikrakenv2', 'refseq']
labels = ['GTDB', 'Minikraken v1 (without human)', 'Minikraken v2 (with human)', 'RefSeq Complete v93']
sum_reduced = []
for db in range(len(conf_bracken_genus)):
new_db = pd.DataFrame(conf_bracken_genus[db].loc[overall_genus, :])
conf_bracken_overall.append(new_db)
new_db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'_common.csv')
conf_bracken_genus[db].to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'.csv')
sums = new_db.sum(axis=0)
sum_reduced.append(sums)
ax[db].bar(x2, sums, color=colors, edgecolor='k', width=0.4, alpha=0.5)
sums = conf_bracken_genus[db].sum(axis=0)
ax[db].bar(x1, sums, color=colors, edgecolor='k', width=0.4)
ax[db].semilogy()
ax[db].set_title(labels[db])
plt.sca(ax[db])
if db == 1 or db == 2:
plt.xticks([])
else:
plt.xticks(x3, sample_names, rotation=90)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'GTDB')
## ([<matplotlib.axis.XTick object at 0x1262b1100>, <matplotlib.axis.XTick object at 0x1262b10d0>, <matplotlib.axis.XTick object at 0x1262906d0>, <matplotlib.axis.XTick object at 0x125df3760>, <matplotlib.axis.XTick object at 0x125df3b50>, <matplotlib.axis.XTick object at 0x125dc4130>, <matplotlib.axis.XTick object at 0x125dc46d0>, <matplotlib.axis.XTick object at 0x125dc4c70>, <matplotlib.axis.XTick object at 0x125dc3250>, <matplotlib.axis.XTick object at 0x125dc37f0>, <matplotlib.axis.XTick object at 0x125dc3d90>, <matplotlib.axis.XTick object at 0x125dcf370>, <matplotlib.axis.XTick object at 0x125dcf910>, <matplotlib.axis.XTick object at 0x125dc32e0>, <matplotlib.axis.XTick object at 0x125dc4190>, <matplotlib.axis.XTick object at 0x125dcf460>, <matplotlib.axis.XTick object at 0x125dd1130>, <matplotlib.axis.XTick object at 0x125dd16d0>, <matplotlib.axis.XTick object at 0x125dd1c70>, <matplotlib.axis.XTick object at 0x125dc0250>, <matplotlib.axis.XTick object at 0x125dc07f0>], <a list of 21 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'Minikraken v1 (without human)')
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'Minikraken v2 (with human)')
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## []
## Text(0.5, 1.0, 'RefSeq Complete v93')
## ([<matplotlib.axis.XTick object at 0x125d29a00>, <matplotlib.axis.XTick object at 0x125d299d0>, <matplotlib.axis.XTick object at 0x125d48fd0>, <matplotlib.axis.XTick object at 0x12a8fea60>, <matplotlib.axis.XTick object at 0x12a8f9040>, <matplotlib.axis.XTick object at 0x12a8f95e0>, <matplotlib.axis.XTick object at 0x12a8f9b80>, <matplotlib.axis.XTick object at 0x12a8e5160>, <matplotlib.axis.XTick object at 0x12a8e5700>, <matplotlib.axis.XTick object at 0x12a8f97c0>, <matplotlib.axis.XTick object at 0x12a8e5ac0>, <matplotlib.axis.XTick object at 0x12a8e5df0>, <matplotlib.axis.XTick object at 0x12a9073d0>, <matplotlib.axis.XTick object at 0x12a907970>, <matplotlib.axis.XTick object at 0x12a907f10>, <matplotlib.axis.XTick object at 0x12a8e44f0>, <matplotlib.axis.XTick object at 0x12a8e4a90>, <matplotlib.axis.XTick object at 0x12a8f5070>, <matplotlib.axis.XTick object at 0x12a8f5610>, <matplotlib.axis.XTick object at 0x12a907fa0>, <matplotlib.axis.XTick object at 0x12a8e5340>], <a list of 21 Text xticklabel objects>)
ax[0].set_ylabel('Number of reads'), ax[1].set_ylabel('Number of reads')
## (Text(0, 0.5, 'Number of reads'), Text(0, 0.5, 'Number of reads'))
handles = [Patch(facecolor='k', edgecolor='k', label='All genera'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Genera present in all')]
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
fig = plt.figure(figsize=(8,4))
ax = plt.subplot(111)
x1 = [x*5 for x in range(21)]
x2 = [x+1 for x in x1]
x3 = [x+1 for x in x2]
x4 = [x+1 for x in x3]
xplt = [x+0.5 for x in x2]
x = [x3, x1, x2, x4]
al = [1, 0.8, 0.6, 0.4]
for db in range(len(sum_reduced)):
ax.bar(x[db], sum_reduced[db], color=colors, width=1, edgecolor='k', alpha=al[db])
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
handles = [Patch(facecolor='k', edgecolor='k', label='Minikraken v1'), Patch(facecolor='k', edgecolor='k', alpha=0.8, label='Minikraken v2'), Patch(facecolor='k', edgecolor='k', alpha=0.6, label='GTDB'), Patch(facecolor='k', edgecolor='k', alpha=0.4, label='RefSeq Complete v93')]
ax.legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.xticks(xplt, sample_names, rotation=90)
## ([<matplotlib.axis.XTick object at 0x1287cea00>, <matplotlib.axis.XTick object at 0x1287cebb0>, <matplotlib.axis.XTick object at 0x1287d4640>, <matplotlib.axis.XTick object at 0x12a8e4670>, <matplotlib.axis.XTick object at 0x12a907f10>, <matplotlib.axis.XTick object at 0x12a8f98e0>, <matplotlib.axis.XTick object at 0x12a8e5460>, <matplotlib.axis.XTick object at 0x12a8e5340>, <matplotlib.axis.XTick object at 0x128e68c70>, <matplotlib.axis.XTick object at 0x12a8e5a00>, <matplotlib.axis.XTick object at 0x12a907250>, <matplotlib.axis.XTick object at 0x127c257f0>, <matplotlib.axis.XTick object at 0x127c25670>, <matplotlib.axis.XTick object at 0x128e77940>, <matplotlib.axis.XTick object at 0x126286a00>, <matplotlib.axis.XTick object at 0x126286d30>, <matplotlib.axis.XTick object at 0x1262b11c0>, <matplotlib.axis.XTick object at 0x1262b1f70>, <matplotlib.axis.XTick object at 0x128e7a280>, <matplotlib.axis.XTick object at 0x1262b1a00>, <matplotlib.axis.XTick object at 0x128e77190>], <a list of 21 Text xticklabel objects>)
plt.semilogy()
## []
plt.ylabel('Number of reads')
plt.tight_layout()
plt.show()
### Genera stacked bars (absolute abundance)
for db in range(len(conf_bracken_overall)):
rename_samples = {}
for sample in list(conf_bracken_overall[db].columns):
rename_samples[sample] = sample+'_'+names[db]
this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
if db == 0:
overall_genus = this_db
else:
overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
#overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 25000]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
## SRR8595488_gtdb ... SRR8595509_refseq
## name ...
## Achromobacter 6731.0 ... 11.0
## Acidisphaera 602.0 ... 3.0
## Acinetobacter 5068.0 ... 4597.0
## Actinomyces 340.0 ... 4126.0
## Aeromonas 802.0 ... 0.0
## ... ... ... ...
## Veillonella 18.0 ... 20951.0
## Vibrio 232.0 ... 1978.0
## Weissella 24.0 ... 3.0
## Xanthomonas 14.0 ... 227.0
## Xylanimonas 0.0 ... 2.0
##
## [112 rows x 84 columns]
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(223)
ax = [ax1, plt.subplot(221, sharey=ax1), plt.subplot(222, sharey=ax1), plt.subplot(224, sharey=ax1)]
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
keeping = []
for s in list(overall_genus.columns):
if name in s:
keeping.append(True)
else:
keeping.append(False)
other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
new_db.append(other_new_db)
for db in range(len(new_db)):
this_genera = list(new_db[db].index.values)
handles = []
for g in range(len(this_genera)):
if g == 0:
bottom = [0 for c in x1]
these_values = new_db[db].loc[this_genera[g], :].values
ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
plt.sca(ax[db])
if db == 1 or db == 2:
plt.xticks([])
else:
plt.xticks(x1, sample_names, rotation=90)
plt.semilogy()
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x126845d30>, <matplotlib.axis.XTick object at 0x126845d00>, <matplotlib.axis.XTick object at 0x1268455b0>, <matplotlib.axis.XTick object at 0x12a67c910>, <matplotlib.axis.XTick object at 0x12a67ceb0>, <matplotlib.axis.XTick object at 0x12a686490>, <matplotlib.axis.XTick object at 0x12a686a30>, <matplotlib.axis.XTick object at 0x12a686fd0>, <matplotlib.axis.XTick object at 0x12a68b5b0>, <matplotlib.axis.XTick object at 0x12a68bb50>, <matplotlib.axis.XTick object at 0x12a690130>, <matplotlib.axis.XTick object at 0x12a68b6d0>, <matplotlib.axis.XTick object at 0x12a686160>, <matplotlib.axis.XTick object at 0x12a690490>, <matplotlib.axis.XTick object at 0x12a690a30>, <matplotlib.axis.XTick object at 0x12a690fd0>, <matplotlib.axis.XTick object at 0x12a6955b0>, <matplotlib.axis.XTick object at 0x12a695b50>, <matplotlib.axis.XTick object at 0x12a69a130>, <matplotlib.axis.XTick object at 0x12a69a6d0>, <matplotlib.axis.XTick object at 0x12a695790>], <a list of 21 Text xticklabel objects>)
## []
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## []
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## []
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x1294fa400>, <matplotlib.axis.XTick object at 0x1294fa430>, <matplotlib.axis.XTick object at 0x129504f40>, <matplotlib.axis.XTick object at 0x130bc47f0>, <matplotlib.axis.XTick object at 0x130bc4d90>, <matplotlib.axis.XTick object at 0x130bcc370>, <matplotlib.axis.XTick object at 0x130bcc910>, <matplotlib.axis.XTick object at 0x130bcceb0>, <matplotlib.axis.XTick object at 0x130bd1490>, <matplotlib.axis.XTick object at 0x130bd1a30>, <matplotlib.axis.XTick object at 0x130bd1fd0>, <matplotlib.axis.XTick object at 0x130bd1070>, <matplotlib.axis.XTick object at 0x130bc4f70>, <matplotlib.axis.XTick object at 0x130bd4370>, <matplotlib.axis.XTick object at 0x130bd4910>, <matplotlib.axis.XTick object at 0x130bd4eb0>, <matplotlib.axis.XTick object at 0x130bdb490>, <matplotlib.axis.XTick object at 0x130bdba30>, <matplotlib.axis.XTick object at 0x130bdbfd0>, <matplotlib.axis.XTick object at 0x130be15b0>, <matplotlib.axis.XTick object at 0x130bdb670>], <a list of 21 Text xticklabel objects>)
## []
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
plt.tight_layout()
plt.show()
## /Library/Frameworks/R.framework/Versions/3.6/Resources/library/reticulate/python/rpytools/call.py:13: UserWarning: Attempted to set non-positive bottom ylim on a log-scaled axis.
## Invalid limit will be ignored.
## res = rpycall.call_r_function(f, *args, **kwargs)
## /Library/Frameworks/R.framework/Versions/3.6/Resources/library/reticulate/python/rpytools/call.py:13: UserWarning: Attempting to set identical bottom == top == 1.0 results in singular transformations; automatically expanding.
## res = rpycall.call_r_function(f, *args, **kwargs)
### Genera stacked bars (relative abundance)
for db in range(len(conf_bracken_overall)):
rename_samples = {}
for sample in list(conf_bracken_overall[db].columns):
rename_samples[sample] = sample+'_'+names[db]
this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
if db == 0:
overall_genus = this_db
else:
overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 1]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
## SRR8595488_gtdb ... SRR8595509_refseq
## name ...
## Achromobacter 3.474961 ... 0.004477
## Acinetobacter 2.616417 ... 1.870874
## Actinomyces 0.175529 ... 1.679188
## Alcanivorax 2.164171 ... 0.001628
## Amycolatopsis 0.926174 ... 0.015465
## ... ... ... ...
## Thermosipho 0.005679 ... 0.000407
## Treponema 0.001549 ... 0.015872
## Veillonella 0.009293 ... 8.526580
## Weissella 0.012390 ... 0.001221
## Xanthomonas 0.007228 ... 0.092384
##
## [64 rows x 84 columns]
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(223)
ax = [ax1, plt.subplot(221, sharey=ax1), plt.subplot(222, sharey=ax1), plt.subplot(224, sharey=ax1)]
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
keeping = []
for s in list(overall_genus.columns):
if name in s:
keeping.append(True)
else:
keeping.append(False)
other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
new_db.append(other_new_db)
for db in range(len(new_db)):
this_genera = list(new_db[db].index.values)
handles = []
for g in range(len(this_genera)):
if g == 0:
bottom = [0 for c in x1]
these_values = new_db[db].loc[this_genera[g], :].values
ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
plt.sca(ax[db])
if db == 1 or db == 2:
plt.xticks([])
else:
plt.xticks(x1, sample_names, rotation=90)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x1294d8370>, <matplotlib.axis.XTick object at 0x1294d8340>, <matplotlib.axis.XTick object at 0x132921910>, <matplotlib.axis.XTick object at 0x12a1c2850>, <matplotlib.axis.XTick object at 0x12a1c20a0>, <matplotlib.axis.XTick object at 0x127c4ca60>, <matplotlib.axis.XTick object at 0x127c4c910>, <matplotlib.axis.XTick object at 0x127c4c130>, <matplotlib.axis.XTick object at 0x125d30ca0>, <matplotlib.axis.XTick object at 0x125d30c70>, <matplotlib.axis.XTick object at 0x125df3e50>, <matplotlib.axis.XTick object at 0x125df3280>, <matplotlib.axis.XTick object at 0x125df3400>, <matplotlib.axis.XTick object at 0x128654c70>, <matplotlib.axis.XTick object at 0x128654a60>, <matplotlib.axis.XTick object at 0x125df3a90>, <matplotlib.axis.XTick object at 0x125d301c0>, <matplotlib.axis.XTick object at 0x128654e20>, <matplotlib.axis.XTick object at 0x13129f1c0>, <matplotlib.axis.XTick object at 0x13129f6a0>, <matplotlib.axis.XTick object at 0x12a695490>], <a list of 21 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([], <a list of 0 Text xticklabel objects>)
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## <BarContainer object of 21 artists>
## ([<matplotlib.axis.XTick object at 0x1269ae520>, <matplotlib.axis.XTick object at 0x1269aea00>, <matplotlib.axis.XTick object at 0x1269ae580>, <matplotlib.axis.XTick object at 0x12e176c70>, <matplotlib.axis.XTick object at 0x12e166250>, <matplotlib.axis.XTick object at 0x12e1667f0>, <matplotlib.axis.XTick object at 0x12e166d90>, <matplotlib.axis.XTick object at 0x12e16b370>, <matplotlib.axis.XTick object at 0x12e16b910>, <matplotlib.axis.XTick object at 0x12e16beb0>, <matplotlib.axis.XTick object at 0x12e163490>, <matplotlib.axis.XTick object at 0x12e163a30>, <matplotlib.axis.XTick object at 0x12e16b580>, <matplotlib.axis.XTick object at 0x12e1663d0>, <matplotlib.axis.XTick object at 0x12e1635b0>, <matplotlib.axis.XTick object at 0x12e154370>, <matplotlib.axis.XTick object at 0x12e154910>, <matplotlib.axis.XTick object at 0x12e154eb0>, <matplotlib.axis.XTick object at 0x12e156490>, <matplotlib.axis.XTick object at 0x12e156a30>, <matplotlib.axis.XTick object at 0x12e156fd0>], <a list of 21 Text xticklabel objects>)
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
plt.tight_layout()
plt.show()
Function is profiled using:
1. HUMAnN2
- Uniref90 database
- Uniref50 database
2. HUMAnN3
- Uniref90 and Uniref50
- This is still running
2. MEGAHIT and the Anvi’o TARA oceans workflow
- This is still running
plt.figure(figsize=(8,6))
ax1 = plt.subplot(111)
#unaligned_after_translation_uniref_90
#unaligned_after_translation_uniref_50
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x = [x+0.2 for x in range(21)]
ax1.bar(x1, reads.loc[:, 'unaligned_after_translation_uniref_90'].values, color=colors, edgecolor='k', width=0.4)
ax1.bar(x2, reads.loc[:, 'unaligned_after_translation_uniref_50'].values, color=colors, edgecolor='k', width=0.4, alpha=0.5)
plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Unaligned reads after translation (%)')
plt.xlim([-0.5,21])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
ax1.legend(handles=handles, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()
## HUMAnN2 richness {.tabset}
After running HUMAnN2, the pathabundance, pathcoverage and genefamilies tables are joined, and the pathabundance is renormalised (relative abundance calculated) and split to tables that are stratified by species or not. Here we use the unstratified pathabundance file (the MetaPhlAn2 species results don’t seem to have been particularly accurate). The genefamilies file is stratified by default so we take only the overall values for the gene family (removing all species results) and re-calculate these values as relative abundances.
def get_richness(df):
snames = list(df.columns)
num_genes = []
for sn in snames:
one_sample = pd.DataFrame(df.loc[:, sn])
one_sample = one_sample[one_sample.max(axis=1) > 0]
num_genes.append(one_sample.shape[0])
return snames, num_genes
genes_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_90.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_90.index.values)
keeping = []
for gn in gene_names:
new_gn = gn.split('|')[0]
if gn == new_gn: keeping.append(True)
else: keeping.append(False)
genes_90_abs = genes_90.loc[keeping, :]
genes_90 = genes_90_abs.divide(genes_90_abs.sum(axis=0)).multiply(100)
pathways_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')
snames_genes_90, genes_90_richness = get_richness(genes_90)
snames_pways_90, pways_90_richness = get_richness(pathways_90)
genes_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_50.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_50.index.values)
keeping = []
for gn in gene_names:
new_gn = gn.split('|')[0]
if gn == new_gn: keeping.append(True)
else: keeping.append(False)
genes_50_abs = genes_50.loc[keeping, :]
genes_50 = genes_50_abs.divide(genes_50_abs.sum(axis=0)).multiply(100)
pathways_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')
snames_genes_50, genes_50_richness = get_richness(genes_50)
snames_pways_50, pways_50_richness = get_richness(pathways_50)
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)
x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]
ax1.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xlim([-0.5,21]), plt.xticks([])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.ylabel('Gene family\nrichness')
plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21])
plt.ylabel('Gene family\nrichness (log scale)')
plt.xticks(x, sample_names, rotation=90)
plt.tight_layout()
plt.show()
#54
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)
x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]
ax1.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xticks([])
plt.ylabel('Pathway richness')
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.xlim([-0.5,21])
plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21]), plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Pathway richness (log scale)')
plt.tight_layout()
plt.show()
Now we are looking at the gene families and pathways that are differentially abundant between body sites, using the lists that are output by HUMAnN2.
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
def get_diff_abundant(genes, name_csv):
col_names = list(genes.columns)
col_names_dict = {}
for cn in col_names:
if 'SRR8595488' in cn:
genes.drop([cn], axis=1, inplace=True)
cn_new = cn.split('_')[0]
col_names_dict[cn] = site_dict[cn_new]
new_genes = genes.rename(columns=col_names_dict)
comparisons = [['Saliva', 'Blood'], ['Buccal', 'Blood'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk']]
fig = plt.figure(figsize=(15,8))
ax = [plt.subplot2grid((2,8), (0,1), colspan=2), plt.subplot2grid((2,8), (0,3), colspan=2), plt.subplot2grid((2,8), (0,5), colspan=2), plt.subplot2grid((2,8), (1,0), colspan=2), plt.subplot2grid((2,8), (1,2), colspan=2), plt.subplot2grid((2,8), (1,4), colspan=2), plt.subplot2grid((2,8), (1,6), colspan=2)]
for a in range(len(comparisons)):
ax[a].set_title(comparisons[a][0]+r' $vs$ '+comparisons[a][1])
this_comparison = new_genes.loc[:, comparisons[a]]
this_comparison = this_comparison[this_comparison.max(axis=1) > 0]
genes_comp = list(this_comparison.index.values)
fc, pval, colors_p, this_fc = [], [], [], []
for b in range(len(genes_comp)):
c1, c2 = list(this_comparison.loc[genes_comp[b], comparisons[a][0]]), list(this_comparison.loc[genes_comp[b], comparisons[a][1]])
c1, c2 = [0.000001 if x == 0 else x for x in c1], [0.000001 if x == 0 else x for x in c2]
c1, c2 = [math.log2(c1[c]) for c in range(len(c1))], [math.log2(c2[c]) for c in range(len(c2))]
t, p = stats.ttest_ind(c1, c2)
c1, c2 = np.median(c1), np.median(c2)
diff = c1/c2
if diff < 1 and diff > 0: diff = -(1/diff)
fc.append(diff), pval.append(p)
this_fc.append([genes_comp[b], diff, p])
colors_p_all = [colors_dict[comparisons[a][0]], colors_dict[comparisons[a][1]]]
for c in range(len(fc)):
if fc[c] >= 2 and pval[c] <= 0.05: colors_p.append(colors_p_all[1])
elif fc[c] <= -2 and pval[c] <= 0.05: colors_p.append(colors_p_all[0])
else: colors_p.append('#B1B1B1')
pval[c] = -math.log10(pval[c])
com = comparisons[a][0]+'_'+comparisons[a][1]
if a == 0:
df_pval = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
df_pval.set_index('Genes/pathways', inplace=True)
else:
new_df = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
new_df.set_index('Genes/pathways', inplace=True)
df_pval = pd.concat([df_pval, new_df], axis=1, join='outer')
ma = max([max(fc), abs(min(fc))])+0.5
if ma > 10: ma = 10.5
ax[a].set_xlim([-ma, ma])
ax[a].scatter(fc, pval, marker='o', c=colors_p, s=10)
ax[a].set_xlabel(r'Log$_2$ fold change')
if a == 0 or a == 3:
ax[a].set_ylabel('-log(p value)')
plt.sca(ax[a])
df_pval.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/'+name_csv+'.csv')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax[2].legend(handles=handles, bbox_to_anchor=(1.04,1), loc='upper left')
return
pways = pd.DataFrame(pathways_90)
get_diff_abundant(pways, 'pathways_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
pways = pd.DataFrame(pathways_50)
get_diff_abundant(pways, 'pathways_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
pways = pd.DataFrame(genes_90)
get_diff_abundant(pways, 'genes_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()
Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.
pways = pd.DataFrame(genes_50)
get_diff_abundant(pways, 'genes_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()